import os
import re
import sqlite3
import datetime
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import cross_validation 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier


# Location of the replication SQLite database. The absolute path is specific to
# one Windows machine; edit it to point at a local copy before running.
database = "C:\\Users\\Daniel\\Dropbox\\Journal (II)\\Replication Files\\Arel-Bundock, Vincent Replication\\replication\\replication_20160118_v2.sqlite"
# Connection opened when the module is loaded; get_data() queries through it.
conn = sqlite3.connect(database)

def get_data(sample, predictors, variables, connection=None):
    """Assemble the analysis table from the replication database.

    Reads the ``wdi``, ``guo``, ``distance``, ``<sample>`` and
    ``predictors_<predictors>`` tables, merges them, and keeps only the
    columns whose name matches one of the requested patterns.

    Parameters
    ----------
    sample : str
        Name of the sample table to read.
    predictors : str
        Suffix of the ``predictors_*`` table to read.
    variables : list of str
        Regular-expression patterns. A column is kept when ``re.search``
        finds any of them in its name. Every column of the predictors table
        except the first is appended to this list (the first is presumably
        the ``iso2c`` join key -- confirm against the database schema).
    connection : DB-API connection, optional
        Connection to query. Defaults to the module-level ``conn``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with ``iso2c_guo``, ``id_guo`` and
        ``iso2c_sub`` moved to the front and rows sorted by those three.
    """
    db = conn if connection is None else connection

    def read_table(name):
        # Table names cannot be bound as SQL parameters, so they are
        # concatenated into the statement; pass trusted names only.
        return pd.read_sql('SELECT * FROM ' + name, db)

    wdi = read_table('wdi')
    guo = read_table('guo')
    sample_rows = read_table(sample)
    distance = read_table('distance')
    political = read_table('predictors_' + predictors)

    # Without an explicit key, pd.merge joins on the columns both frames share.
    dat = pd.merge(sample_rows, guo, how='left')
    dat = pd.merge(dat, distance, how='left')
    # These two use merge's default inner join, so rows whose iso2c_sub has no
    # match in wdi or in the predictors table are dropped.
    dat = pd.merge(dat, wdi, left_on='iso2c_sub', right_on='iso2c')
    dat = pd.merge(dat, political, left_on='iso2c_sub', right_on='iso2c')

    patterns = variables + political.columns.tolist()[1:]
    # Compile once rather than re-parsing the alternation for every column.
    wanted = re.compile('|'.join(patterns))
    dat = dat[[col for col in dat.columns if wanted.search(col) is not None]]
    return dat.set_index(['iso2c_guo', 'id_guo', 'iso2c_sub']).sort_index().reset_index()

def get_arrays(dat):
    """Turn the analysis table into weighted train/test arrays.

    Rows containing missing values are dropped first. Two feature matrices
    are built: one with every feature column and one without the columns
    that ``codebook.csv`` (read from the working directory) marks as
    political. Outcome, weights and both matrices are then split 50/50 with
    a fixed seed.

    Parameters
    ----------
    dat : pandas.DataFrame
        Table with ``sub_dummy_host``, ``id_guo`` and ``iso2c_sub`` columns.
        Features are taken from the fifth column onward, which assumes the
        first four are the three identifiers plus the outcome, as laid out
        by ``get_data`` -- confirm if the column selection changes.

    Returns
    -------
    dict
        Arrays under ``y_train``, ``y_test``, ``w_train``, ``w_test``,
        ``X_pol_train``, ``X_pol_test``, ``X_apol_train``, ``X_apol_test``,
        plus the column-name lists ``variables_pol`` and ``variables_apol``.
    """
    dat = dat.dropna()

    # weights
    # tau: positives as a share of every id_guo x iso2c_sub combination;
    # ybar: positives as a share of the rows actually present.
    n_cells = len(dat.id_guo.unique()) * len(dat.iso2c_sub.unique())
    # float() forces true division even where `/` between integers would
    # floor (Python 2 without `from __future__ import division`).
    tau = float(dat.sub_dummy_host.sum()) / n_cells
    ybar = dat.sub_dummy_host.mean()
    w1 = tau / ybar
    w0 = (1 - tau) / (1 - ybar)
    w = np.where(dat.sub_dummy_host == 1, w1, w0)

    # variables
    codebook = pd.read_csv('codebook.csv')
    political = '|'.join(codebook.variable[codebook.political == 1].tolist())

    # arrays
    # `.values` replaces Series/DataFrame.as_matrix(), removed in pandas 1.0.
    y = dat.sub_dummy_host.values
    X = dat.iloc[:, 4:]
    if political:
        apol_cols = [x for x in X.columns if re.search(political, x) is None]
    else:
        # An empty pattern matches every name; with no political variables
        # listed, nothing should be excluded.
        apol_cols = X.columns.tolist()
    X_apol = X[apol_cols]
    variables_pol = X.columns.tolist()
    variables_apol = X_apol.columns.tolist()
    X_pol = X.values
    X_apol = X_apol.values

    # train_test_split returns a (train, test) pair per input, in input order,
    # which is the order the labels below follow.
    arrays = cross_validation.train_test_split(y, w, X_pol, X_apol, train_size=.5, random_state=1024)
    labels = ['y_train', 'y_test', 'w_train', 'w_test', 'X_pol_train',
              'X_pol_test', 'X_apol_train', 'X_apol_test']
    out = dict(zip(labels, arrays))
    out['variables_pol'] = variables_pol
    out['variables_apol'] = variables_apol
    return out

def estimate(arrays, cls):
    """Score one classifier without and then with the political features.

    The classifier's ``random_state`` attribute is set to 1024, after which it
    is fitted and scored on the apolitical matrices and then refitted and
    scored on the full matrices.

    Parameters
    ----------
    arrays : dict
        Mapping that provides ``X_apol_train``, ``X_apol_test``,
        ``X_pol_train``, ``X_pol_test``, ``y_train`` and ``y_test``.
    cls : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``score_apol`` and ``score_pol``.
    """
    cls.random_state = 1024
    # Apolitical specification first, then the full one; the same estimator
    # object is refitted for the second pass.
    specs = [('X_apol_train', 'X_apol_test'), ('X_pol_train', 'X_pol_test')]
    scores = []
    for train_key, test_key in specs:
        cls.fit(arrays[train_key], arrays['y_train'])
        scores.append(cls.score(arrays[test_key], arrays['y_test']))
    results = pd.DataFrame(scores).transpose()
    results.columns = ['score_apol', 'score_pol']
    return results

class Logistic:
    """Logit model offering the ``fit``/``score`` calls that estimate() uses."""

    def fit(self, X, y):
        """Fit ``sm.Logit`` on ``y`` and ``X`` and keep the result in ``self.cls``.

        ``X`` is handed to ``sm.Logit`` unchanged.
        """
        model = sm.Logit(y, X)
        self.cls = model.fit()

    def score(self, X, y):
        """Return the share of rows whose predicted class equals ``y``.

        A row is classified as 1 when the fitted model's prediction is
        strictly greater than .5, and as 0 otherwise.
        """
        probabilities = self.cls.predict(X)
        predicted = np.where(probabilities > .5, 1, 0)
        matches = predicted == y
        return np.mean(matches)

# Classifiers keyed by display name. The scikit-learn estimators are seeded
# with 1024; Logistic is the statsmodels-based wrapper defined in this file.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=1024),
    'Extremely Randomized Trees': ExtraTreesClassifier(n_estimators=50, random_state=1024),
    'Ada Boost': AdaBoostClassifier(random_state=1024),
    'Bagging': BaggingClassifier(random_state=1024),
    'Logistic': Logistic(),
}

# Names of sample tables (presumably the values meant for get_data's `sample`
# argument -- only 'sample' is used in the code visible here).
tables = ['sample', 'sample_klarge', 'sample_hi', 'sample_lo', 'sample_havens',
          'sample_outliers', 'sample_strata']

# Column-name patterns. get_data() joins them with '|' and applies the result
# with re.search, so each entry is a regular expression, not a literal name.
# The raw string keeps the backslash in `\d` without relying on Python passing
# an unrecognised escape sequence through unchanged.
variables = ['id_guo', 'iso2c.guo', 'iso2c.sub', 'iso_count', 'sub_dummy_host',
             r'naics\d', 'distw', '^gdp$']

# Variants of the baseline pattern list with extra or fewer entries.
variables_distance = variables + ['economic_distance', 'administrative_distance']
variables_wdi = variables + ['gdppc', 'pop', 'pop_urban', 'telephone']
variables_nodistance = [x for x in variables if x != 'distw']

# Build the merged table for each specification; only the baseline one
# (table 'sample', predictors 'ksmall') is defined here.
print('data\n')
data = {'baseline': get_data('sample', 'ksmall', variables)}

# Split every table into train/test arrays under the same key.
print('arrays\n')
arrays = {name: get_arrays(frame) for name, frame in data.items()}

# IMPORTANCE
# Fit a weighted random forest on the full (political) feature set and write
# each feature's importance to tables/figure3.csv.
# The baseline split already exists in `arrays`; get_arrays uses a fixed
# random_state, so calling it again would only repeat the same work.
arr = arrays['baseline']
cls = RandomForestClassifier(n_estimators=50, random_state=1024)
cls.fit(arr['X_pol_train'], arr['y_train'],
        sample_weight=arr['w_train'])
# Two rows (names, importances) transposed into two columns.
importance = pd.DataFrame([arr['variables_pol'],
                           np.ravel(cls.feature_importances_).tolist()]).T
importance.columns = ['Variable', 'Importance']
# os.path.join rather than a hard-coded backslash, so the file also lands
# inside the tables directory on non-Windows systems.
importance.to_csv(os.path.join('tables', 'figure3.csv'), index=False)
